{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    A   B   C   D\n",
      "0  a0  b0  c0  d0\n",
      "1  a1  b1  c1  d1\n",
      "2  a2  b2  c2  d2\n",
      "3  a3  b3  c3  d3 \n",
      "     A   B   C   D\n",
      "0  a4  b4  c4  d4\n",
      "1  a5  b5  c5  d5\n",
      "2  a6  b6  c6  d6\n",
      "3  a7  b7  c7  d7 \n",
      "      A    B    C    D\n",
      "0   a8   b8   c8   d8\n",
      "1   a9   b9   c9   d9\n",
      "2  a10  b10  c10  d10\n",
      "3  a11  b11  c11  d11\n"
     ]
    }
   ],
   "source": [
    "# 数据组合\n",
    "import pandas as pd\n",
    "df1 = pd.read_csv('../data/concat_1.csv')\n",
    "df2 = pd.read_csv('../data/concat_2.csv')\n",
    "df3 = pd.read_csv('../data/concat_3.csv')\n",
    "\n",
    "print(df1,'\\n',df2,'\\n',df3)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      A    B    C    D\n",
      "0    a0   b0   c0   d0\n",
      "1    a1   b1   c1   d1\n",
      "2    a2   b2   c2   d2\n",
      "3    a3   b3   c3   d3\n",
      "4    a4   b4   c4   d4\n",
      "5    a5   b5   c5   d5\n",
      "6    a6   b6   c6   d6\n",
      "7    a7   b7   c7   d7\n",
      "8    a8   b8   c8   d8\n",
      "9    a9   b9   c9   d9\n",
      "10  a10  b10  c10  d10\n",
      "11  a11  b11  c11  d11\n",
      "   0   1   2   3   4   5   6   7    8    9    10   11\n",
      "0  a0  b0  c0  d0  a4  b4  c4  d4   a8   b8   c8   d8\n",
      "1  a1  b1  c1  d1  a5  b5  c5  d5   a9   b9   c9   d9\n",
      "2  a2  b2  c2  d2  a6  b6  c6  d6  a10  b10  c10  d10\n",
      "3  a3  b3  c3  d3  a7  b7  c7  d7  a11  b11  c11  d11\n"
     ]
    }
   ],
   "source": [
    "row_concat_i = pd.concat([df1,df2,df3],ignore_index = True)\n",
    "print(row_concat_i)\n",
    "col_concat = pd.concat([df1,df2,df3],axis = 1,ignore_index = True)\n",
    "print(col_concat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     A    B    C    D    E    F    G    H\n",
      "0   a0   b0   c0   d0  NaN  NaN  NaN  NaN\n",
      "1   a1   b1   c1   d1  NaN  NaN  NaN  NaN\n",
      "2   a2   b2   c2   d2  NaN  NaN  NaN  NaN\n",
      "3   a3   b3   c3   d3  NaN  NaN  NaN  NaN\n",
      "0  NaN  NaN  NaN  NaN   a4   b4   c4   d4\n",
      "1  NaN  NaN  NaN  NaN   a5   b5   c5   d5\n",
      "2  NaN  NaN  NaN  NaN   a6   b6   c6   d6\n",
      "3  NaN  NaN  NaN  NaN   a7   b7   c7   d7\n",
      "0   a8  NaN   b8  NaN  NaN   c8  NaN   d8\n",
      "1   a9  NaN   b9  NaN  NaN   c9  NaN   d9\n",
      "2  a10  NaN  b10  NaN  NaN  c10  NaN  d10\n",
      "3  a11  NaN  b11  NaN  NaN  c11  NaN  d11\n",
      "Empty DataFrame\n",
      "Columns: []\n",
      "Index: [0, 1, 2, 3, 0, 1, 2, 3]\n"
     ]
    }
   ],
   "source": [
    "#不同索引下的数据集的合并\n",
    "df1.columns = ['A','B','C','D']\n",
    "df2.columns = ['E','F','G','H']\n",
    "df3.columns = ['A','C','F','H']\n",
    "\n",
    "row_concat = pd.concat([df1,df2,df3])\n",
    "print(row_concat)\n",
    "\n",
    "print(pd.concat([df1,df2],ignore_index = False,join = 'inner'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      ident   personal    family\n",
      "0      dyer    William      Dyer\n",
      "1        pb      Frank   Pabodie\n",
      "2      lake   Anderson      Lake\n",
      "3       roe  Valentina   Roerich\n",
      "4  danforth      Frank  Danforth \n",
      "     name    lat    long\n",
      "0   DR-1 -49.85 -128.57\n",
      "1   DR-3 -47.15 -126.72\n",
      "2  MSK-4 -48.87 -123.40 \n",
      "    ident   site       dated\n",
      "0    619   DR-1  1927-02-08\n",
      "1    622   DR-1  1927-02-10\n",
      "2    734   DR-3  1939-01-07\n",
      "3    735   DR-3  1930-01-12\n",
      "4    751   DR-3  1930-02-26\n",
      "5    752   DR-3         NaN\n",
      "6    837  MSK-4  1932-01-14\n",
      "7    844   DR-1  1932-03-22\n",
      "\n",
      "     taken person quant  reading\n",
      "0     619   dyer   rad     9.82\n",
      "1     619   dyer   sal     0.13\n",
      "2     622   dyer   rad     7.80\n",
      "3     622   dyer   sal     0.09\n",
      "4     734     pb   rad     8.41\n",
      "5     734   lake   sal     0.05\n",
      "6     734     pb  temp   -21.50\n",
      "7     735     pb   rad     7.22\n",
      "8     735    NaN   sal     0.06\n",
      "9     735    NaN  temp   -26.00\n",
      "10    751     pb   rad     4.35\n",
      "11    751     pb  temp   -18.50\n",
      "12    751   lake   sal     0.10\n",
      "13    752   lake   rad     2.19\n",
      "14    752   lake   sal     0.09\n",
      "15    752   lake  temp   -16.00\n",
      "16    752    roe   sal    41.60\n",
      "17    837   lake   rad     1.46\n",
      "18    837   lake   sal     0.21\n",
      "19    837    roe   sal    22.50\n",
      "20    844    roe   rad    11.25\n"
     ]
    }
   ],
   "source": [
    "# 合并多个数据集\n",
    "person = pd.read_csv('../data/survey_person.csv')\n",
    "site = pd.read_csv('../data/survey_site.csv')\n",
    "survey = pd.read_csv('../data/survey_survey.csv')\n",
    "visited = pd.read_csv('../data/survey_visited.csv')\n",
    "print(person,'\\n',site,'\\n',visited)\n",
    "print('\\n',survey)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    name    lat    long  ident   site       dated\n",
      "0   DR-1 -49.85 -128.57    619   DR-1  1927-02-08\n",
      "1   DR-3 -47.15 -126.72    734   DR-3  1939-01-07\n",
      "2  MSK-4 -48.87 -123.40    837  MSK-4  1932-01-14\n",
      "\n",
      "     name    lat    long  ident   site       dated\n",
      "0   DR-1 -49.85 -128.57    619   DR-1  1927-02-08\n",
      "1   DR-1 -49.85 -128.57    622   DR-1  1927-02-10\n",
      "2   DR-1 -49.85 -128.57    844   DR-1  1932-03-22\n",
      "3   DR-3 -47.15 -126.72    734   DR-3  1939-01-07\n",
      "4   DR-3 -47.15 -126.72    735   DR-3  1930-01-12\n",
      "5   DR-3 -47.15 -126.72    751   DR-3  1930-02-26\n",
      "6   DR-3 -47.15 -126.72    752   DR-3         NaN\n",
      "7  MSK-4 -48.87 -123.40    837  MSK-4  1932-01-14\n",
      "   ident   personal   family  taken person quant  reading\n",
      "0   dyer    William     Dyer    619   dyer   rad     9.82\n",
      "1   dyer    William     Dyer    619   dyer   sal     0.13\n",
      "2   dyer    William     Dyer    622   dyer   rad     7.80\n",
      "3   dyer    William     Dyer    622   dyer   sal     0.09\n",
      "4     pb      Frank  Pabodie    734     pb   rad     8.41\n",
      "5     pb      Frank  Pabodie    734     pb  temp   -21.50\n",
      "6     pb      Frank  Pabodie    735     pb   rad     7.22\n",
      "7     pb      Frank  Pabodie    751     pb   rad     4.35\n",
      "8     pb      Frank  Pabodie    751     pb  temp   -18.50\n",
      "9   lake   Anderson     Lake    734   lake   sal     0.05\n",
      "10  lake   Anderson     Lake    751   lake   sal     0.10\n",
      "11  lake   Anderson     Lake    752   lake   rad     2.19\n",
      "12  lake   Anderson     Lake    752   lake   sal     0.09\n",
      "13  lake   Anderson     Lake    752   lake  temp   -16.00\n",
      "14  lake   Anderson     Lake    837   lake   rad     1.46\n",
      "15  lake   Anderson     Lake    837   lake   sal     0.21\n",
      "16   roe  Valentina  Roerich    752    roe   sal    41.60\n",
      "17   roe  Valentina  Roerich    837    roe   sal    22.50\n",
      "18   roe  Valentina  Roerich    844    roe   rad    11.25\n",
      "\n",
      "     ident   site       dated  taken person quant  reading\n",
      "0     619   DR-1  1927-02-08    619   dyer   rad     9.82\n",
      "1     619   DR-1  1927-02-08    619   dyer   sal     0.13\n",
      "2     622   DR-1  1927-02-10    622   dyer   rad     7.80\n",
      "3     622   DR-1  1927-02-10    622   dyer   sal     0.09\n",
      "4     734   DR-3  1939-01-07    734     pb   rad     8.41\n",
      "5     734   DR-3  1939-01-07    734   lake   sal     0.05\n",
      "6     734   DR-3  1939-01-07    734     pb  temp   -21.50\n",
      "7     735   DR-3  1930-01-12    735     pb   rad     7.22\n",
      "8     735   DR-3  1930-01-12    735    NaN   sal     0.06\n",
      "9     735   DR-3  1930-01-12    735    NaN  temp   -26.00\n",
      "10    751   DR-3  1930-02-26    751     pb   rad     4.35\n",
      "11    751   DR-3  1930-02-26    751     pb  temp   -18.50\n",
      "12    751   DR-3  1930-02-26    751   lake   sal     0.10\n",
      "13    752   DR-3         NaN    752   lake   rad     2.19\n",
      "14    752   DR-3         NaN    752   lake   sal     0.09\n",
      "15    752   DR-3         NaN    752   lake  temp   -16.00\n",
      "16    752   DR-3         NaN    752    roe   sal    41.60\n",
      "17    837  MSK-4  1932-01-14    837   lake   rad     1.46\n",
      "18    837  MSK-4  1932-01-14    837   lake   sal     0.21\n",
      "19    837  MSK-4  1932-01-14    837    roe   sal    22.50\n",
      "20    844   DR-1  1932-03-22    844    roe   rad    11.25\n",
      "\n",
      " ident_x           dyer\n",
      "personal       William\n",
      "family            Dyer\n",
      "taken_x            619\n",
      "person_x          dyer\n",
      "quant              rad\n",
      "reading           9.82\n",
      "ident_y            619\n",
      "site              DR-1\n",
      "dated       1927-02-08\n",
      "taken_y            619\n",
      "person_y          dyer\n",
      "Name: 0, dtype: object\n"
     ]
    }
   ],
   "source": [
    "# one to one\n",
    "visited_subset = visited.loc[[0,2,6],]\n",
    "o2o_merge = site.merge(visited_subset,left_on = 'name',right_on = 'site')\n",
    "print(o2o_merge)\n",
    "\n",
    "# more to one\n",
    "m2o_merge = site.merge(visited,left_on = 'name',right_on = 'site')\n",
    "print('\\n',m2o_merge)\n",
    "\n",
    "# more to more\n",
    "ps = person.merge(survey,left_on = 'ident',right_on = 'person')\n",
    "vs = visited.merge(survey,left_on = 'ident',right_on = 'taken')\n",
    "print(ps)\n",
    "print('\\n',vs)\n",
    "ps_vs = ps.merge(vs,left_on = ['ident','taken','quant','reading'],\n",
    "                 right_on = ['person','ident','quant','reading'])\n",
    "print('\\n',ps_vs.loc[0,])\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env_py38",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
